Index

Alignmnent and counting of the fastq files

This step is performed on the O2 cluster. The fastq file quality was checked using FastQC and MultiQC. They are aligned to Ensembl mm10 genome and counted using Salmon pseudoaligner. Output sf files were transfered from O2 to local machine for further processing in R.

Library loading and set up

suppressMessages(
  c(library(DESeq2),
    library(limma),
    library(tximport),
    library(gridExtra),
    library(ensembldb),
    library(EnsDb.Mmusculus.v79),
    library(grid),
    library(ggplot2),
    library(lattice),
    library(reshape),
    library(mixOmics),
    library(gplots),
    library(RColorBrewer),
    library(readr),
    library(dplyr),
    library(VennDiagram),
    library(clusterProfiler),
    library(DOSE),
    library(org.Mm.eg.db), 
    library(pathview),
    library(AnnotationDbi),
    library(knitr))
)
##   [1] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##   [4] "matrixStats"          "Biobase"              "GenomicRanges"       
##   [7] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
##  [10] "BiocGenerics"         "parallel"             "stats4"              
##  [13] "stats"                "graphics"             "grDevices"           
##  [16] "utils"                "datasets"             "methods"             
##  [19] "base"                 "limma"                "DESeq2"              
##  [22] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
##  [25] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
##  [28] "IRanges"              "S4Vectors"            "BiocGenerics"        
##  [31] "parallel"             "stats4"               "stats"               
##  [34] "graphics"             "grDevices"            "utils"               
##  [37] "datasets"             "methods"              "base"                
##  [40] "tximport"             "limma"                "DESeq2"              
##  [43] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
##  [46] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
##  [49] "IRanges"              "S4Vectors"            "BiocGenerics"        
##  [52] "parallel"             "stats4"               "stats"               
##  [55] "graphics"             "grDevices"            "utils"               
##  [58] "datasets"             "methods"              "base"                
##  [61] "gridExtra"            "tximport"             "limma"               
##  [64] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [67] "matrixStats"          "Biobase"              "GenomicRanges"       
##  [70] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
##  [73] "BiocGenerics"         "parallel"             "stats4"              
##  [76] "stats"                "graphics"             "grDevices"           
##  [79] "utils"                "datasets"             "methods"             
##  [82] "base"                 "ensembldb"            "AnnotationFilter"    
##  [85] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
##  [88] "tximport"             "limma"                "DESeq2"              
##  [91] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
##  [94] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
##  [97] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [100] "parallel"             "stats4"               "stats"               
## [103] "graphics"             "grDevices"            "utils"               
## [106] "datasets"             "methods"              "base"                
## [109] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [112] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [115] "tximport"             "limma"                "DESeq2"              
## [118] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [121] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [124] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [127] "parallel"             "stats4"               "stats"               
## [130] "graphics"             "grDevices"            "utils"               
## [133] "datasets"             "methods"              "base"                
## [136] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [139] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [142] "gridExtra"            "tximport"             "limma"               
## [145] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [148] "matrixStats"          "Biobase"              "GenomicRanges"       
## [151] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [154] "BiocGenerics"         "parallel"             "stats4"              
## [157] "stats"                "graphics"             "grDevices"           
## [160] "utils"                "datasets"             "methods"             
## [163] "base"                 "ggplot2"              "grid"                
## [166] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [169] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [172] "tximport"             "limma"                "DESeq2"              
## [175] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [178] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [181] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [184] "parallel"             "stats4"               "stats"               
## [187] "graphics"             "grDevices"            "utils"               
## [190] "datasets"             "methods"              "base"                
## [193] "lattice"              "ggplot2"              "grid"                
## [196] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [199] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [202] "tximport"             "limma"                "DESeq2"              
## [205] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [208] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [211] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [214] "parallel"             "stats4"               "stats"               
## [217] "graphics"             "grDevices"            "utils"               
## [220] "datasets"             "methods"              "base"                
## [223] "reshape"              "lattice"              "ggplot2"             
## [226] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [229] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [232] "gridExtra"            "tximport"             "limma"               
## [235] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [238] "matrixStats"          "Biobase"              "GenomicRanges"       
## [241] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [244] "BiocGenerics"         "parallel"             "stats4"              
## [247] "stats"                "graphics"             "grDevices"           
## [250] "utils"                "datasets"             "methods"             
## [253] "base"                 "mixOmics"             "MASS"                
## [256] "reshape"              "lattice"              "ggplot2"             
## [259] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [262] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [265] "gridExtra"            "tximport"             "limma"               
## [268] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [271] "matrixStats"          "Biobase"              "GenomicRanges"       
## [274] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [277] "BiocGenerics"         "parallel"             "stats4"              
## [280] "stats"                "graphics"             "grDevices"           
## [283] "utils"                "datasets"             "methods"             
## [286] "base"                 "gplots"               "mixOmics"            
## [289] "MASS"                 "reshape"              "lattice"             
## [292] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [295] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [298] "AnnotationDbi"        "gridExtra"            "tximport"            
## [301] "limma"                "DESeq2"               "SummarizedExperiment"
## [304] "DelayedArray"         "matrixStats"          "Biobase"             
## [307] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [310] "S4Vectors"            "BiocGenerics"         "parallel"            
## [313] "stats4"               "stats"                "graphics"            
## [316] "grDevices"            "utils"                "datasets"            
## [319] "methods"              "base"                 "RColorBrewer"        
## [322] "gplots"               "mixOmics"             "MASS"                
## [325] "reshape"              "lattice"              "ggplot2"             
## [328] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [331] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [334] "gridExtra"            "tximport"             "limma"               
## [337] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [340] "matrixStats"          "Biobase"              "GenomicRanges"       
## [343] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [346] "BiocGenerics"         "parallel"             "stats4"              
## [349] "stats"                "graphics"             "grDevices"           
## [352] "utils"                "datasets"             "methods"             
## [355] "base"                 "readr"                "RColorBrewer"        
## [358] "gplots"               "mixOmics"             "MASS"                
## [361] "reshape"              "lattice"              "ggplot2"             
## [364] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [367] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [370] "gridExtra"            "tximport"             "limma"               
## [373] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [376] "matrixStats"          "Biobase"              "GenomicRanges"       
## [379] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [382] "BiocGenerics"         "parallel"             "stats4"              
## [385] "stats"                "graphics"             "grDevices"           
## [388] "utils"                "datasets"             "methods"             
## [391] "base"                 "dplyr"                "readr"               
## [394] "RColorBrewer"         "gplots"               "mixOmics"            
## [397] "MASS"                 "reshape"              "lattice"             
## [400] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [403] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [406] "AnnotationDbi"        "gridExtra"            "tximport"            
## [409] "limma"                "DESeq2"               "SummarizedExperiment"
## [412] "DelayedArray"         "matrixStats"          "Biobase"             
## [415] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [418] "S4Vectors"            "BiocGenerics"         "parallel"            
## [421] "stats4"               "stats"                "graphics"            
## [424] "grDevices"            "utils"                "datasets"            
## [427] "methods"              "base"                 "VennDiagram"         
## [430] "futile.logger"        "dplyr"                "readr"               
## [433] "RColorBrewer"         "gplots"               "mixOmics"            
## [436] "MASS"                 "reshape"              "lattice"             
## [439] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [442] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [445] "AnnotationDbi"        "gridExtra"            "tximport"            
## [448] "limma"                "DESeq2"               "SummarizedExperiment"
## [451] "DelayedArray"         "matrixStats"          "Biobase"             
## [454] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [457] "S4Vectors"            "BiocGenerics"         "parallel"            
## [460] "stats4"               "stats"                "graphics"            
## [463] "grDevices"            "utils"                "datasets"            
## [466] "methods"              "base"                 "clusterProfiler"     
## [469] "VennDiagram"          "futile.logger"        "dplyr"               
## [472] "readr"                "RColorBrewer"         "gplots"              
## [475] "mixOmics"             "MASS"                 "reshape"             
## [478] "lattice"              "ggplot2"              "grid"                
## [481] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [484] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [487] "tximport"             "limma"                "DESeq2"              
## [490] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [493] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [496] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [499] "parallel"             "stats4"               "stats"               
## [502] "graphics"             "grDevices"            "utils"               
## [505] "datasets"             "methods"              "base"                
## [508] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [511] "futile.logger"        "dplyr"                "readr"               
## [514] "RColorBrewer"         "gplots"               "mixOmics"            
## [517] "MASS"                 "reshape"              "lattice"             
## [520] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [523] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [526] "AnnotationDbi"        "gridExtra"            "tximport"            
## [529] "limma"                "DESeq2"               "SummarizedExperiment"
## [532] "DelayedArray"         "matrixStats"          "Biobase"             
## [535] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [538] "S4Vectors"            "BiocGenerics"         "parallel"            
## [541] "stats4"               "stats"                "graphics"            
## [544] "grDevices"            "utils"                "datasets"            
## [547] "methods"              "base"                 "org.Mm.eg.db"        
## [550] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [553] "futile.logger"        "dplyr"                "readr"               
## [556] "RColorBrewer"         "gplots"               "mixOmics"            
## [559] "MASS"                 "reshape"              "lattice"             
## [562] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [565] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [568] "AnnotationDbi"        "gridExtra"            "tximport"            
## [571] "limma"                "DESeq2"               "SummarizedExperiment"
## [574] "DelayedArray"         "matrixStats"          "Biobase"             
## [577] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [580] "S4Vectors"            "BiocGenerics"         "parallel"            
## [583] "stats4"               "stats"                "graphics"            
## [586] "grDevices"            "utils"                "datasets"            
## [589] "methods"              "base"                 "pathview"            
## [592] "org.Hs.eg.db"         "org.Mm.eg.db"         "DOSE"                
## [595] "clusterProfiler"      "VennDiagram"          "futile.logger"       
## [598] "dplyr"                "readr"                "RColorBrewer"        
## [601] "gplots"               "mixOmics"             "MASS"                
## [604] "reshape"              "lattice"              "ggplot2"             
## [607] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [610] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [613] "gridExtra"            "tximport"             "limma"               
## [616] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [619] "matrixStats"          "Biobase"              "GenomicRanges"       
## [622] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [625] "BiocGenerics"         "parallel"             "stats4"              
## [628] "stats"                "graphics"             "grDevices"           
## [631] "utils"                "datasets"             "methods"             
## [634] "base"                 "pathview"             "org.Hs.eg.db"        
## [637] "org.Mm.eg.db"         "DOSE"                 "clusterProfiler"     
## [640] "VennDiagram"          "futile.logger"        "dplyr"               
## [643] "readr"                "RColorBrewer"         "gplots"              
## [646] "mixOmics"             "MASS"                 "reshape"             
## [649] "lattice"              "ggplot2"              "grid"                
## [652] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [655] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [658] "tximport"             "limma"                "DESeq2"              
## [661] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [664] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [667] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [670] "parallel"             "stats4"               "stats"               
## [673] "graphics"             "grDevices"            "utils"               
## [676] "datasets"             "methods"              "base"                
## [679] "knitr"                "pathview"             "org.Hs.eg.db"        
## [682] "org.Mm.eg.db"         "DOSE"                 "clusterProfiler"     
## [685] "VennDiagram"          "futile.logger"        "dplyr"               
## [688] "readr"                "RColorBrewer"         "gplots"              
## [691] "mixOmics"             "MASS"                 "reshape"             
## [694] "lattice"              "ggplot2"              "grid"                
## [697] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [700] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [703] "tximport"             "limma"                "DESeq2"              
## [706] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [709] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [712] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [715] "parallel"             "stats4"               "stats"               
## [718] "graphics"             "grDevices"            "utils"               
## [721] "datasets"             "methods"              "base"

Analysis of all samples

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count sf files

# Generate a tx2gene object for matrix generation
edb <- EnsDb.Mmusculus.v79 
transcriptsID <- as.data.frame(transcripts(edb))
tx2gene <- as.data.frame(cbind(transcriptsID$tx_id, transcriptsID$gene_id))

# Generate DESeqData using the counting result generated using Salmon
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Gene Counts/TCT")
inDir = getwd()
countFiles = list.files(inDir, pattern=".sf$", full.names = TRUE)
basename(countFiles)
##  [1] "I_MKI1_3963_LIB037245_GEN00137855_R1.sf"   
##  [2] "I_MKI2_1711_LIB037245_GEN00137858_R1.sf"   
##  [3] "I_MKI3_3504_LIB041682_GEN00156317_R1.sf"   
##  [4] "I_MKI4_3501_LIB041682_GEN00156318_R1.sf"   
##  [5] "I_MKI5_3503_LIB041682_GEN00156319_R1.sf"   
##  [6] "I_MKI6_LS78_LIB044271_GEN00169444_R1.sf"   
##  [7] "I_V1_3999_LIB041682_GEN00156316_R1.sf"     
##  [8] "I_V2_KL3687_LIB044271_GEN00169448_R1.sf"   
##  [9] "I_V3_KL3693_LIB044271_GEN00169449_R1.sf"   
## [10] "I_V4_KL3823_LIB044271_GEN00169451_R1.sf"   
## [11] "I_V5_KL3912_LIB044271_GEN00169453_R1.sf"   
## [12] "I_V6_LS79_LIB044271_GEN00169445_R1.sf"     
## [13] "NI_MKI1_3976_LIB037245_GEN00137847_R1.sf"  
## [14] "NI_MKI2_3960_LIB037245_GEN00137848_R1.sf"  
## [15] "NI_MKI3_3561_LIB037245_GEN00137849_R1.sf"  
## [16] "NI_MKI4_3462_LIB037245_GEN00137850_R1.sf"  
## [17] "NI_MKI5_3505_LIB041682_GEN00156321_R1.sf"  
## [18] "NI_MKI6_KL3695_LIB044271_GEN00169450_R1.sf"
## [19] "NI_V1_3974_LIB037245_GEN00137843_R1.sf"    
## [20] "NI_V2_3975_LIB037245_GEN00137844_R1.sf"    
## [21] "NI_V3_3962_LIB037245_GEN00137845_R1.sf"    
## [22] "NI_V4_3461_LIB037245_GEN00137846_R1.sf"    
## [23] "NI_V5_3914_LIB041682_GEN00156320_R1.sf"    
## [24] "NI_V6_KL3688_LIB044271_GEN00169446_R1.sf"  
## [25] "NI_V7_KL3832_LIB044271_GEN00169452_R1.sf"
names(countFiles) <- c('I_MKI_1','I_MKI_2','I_MKI_3','I_MKI_4','I_MKI_5','I_MKI_6', 'I_V_1', 'I_V_2', 'I_V_3', 'I_V_4', 'I_V_5', 'I_V_6', 'NI_MKI_1','NI_MKI_2','NI_MKI_3','NI_MKI_4', 'NI_MKI_5', 'NI_MKI_6', 'NI_V_1', 'NI_V_2', 'NI_V_3', 'NI_V_4', 'NI_V_5', 'NI_V_6', 'NI_V_7')

countFiles_sum <- countFiles[-c(21,23)]

txi.salmon <- tximport(countFiles_sum, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable_all <- data.frame(condition = c(rep('experimental', 12), rep('control', 13)))

DESeqsampletable_all$batch <- factor(c('1','1','3','3','3','4','3','4','4','4','4','4','1','1','1','1','3','4','1','1','1','1','3','4','4'))
DESeqsampletable_all$gender <- factor(c('F','F','M','F','M','F','F','M','F','M','F','M','F','F','F','M','M','M','F','F','F','F','M','M','M'))

DESeqsampletable_sum <- DESeqsampletable_all[-c(21,23),]
rownames(DESeqsampletable_sum) <- colnames(txi.salmon$counts)

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable_sum, ~ batch + gender + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Quality Inspection of the Gene Count Data

Generate raw count table that contains the simple counts of each gene

Data is transformed and pseudocount is calculated.

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
  ggplot(pseudoCount, aes(x= I_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_1"), 
  ggplot(pseudoCount, aes(x= I_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_2"),
  ggplot(pseudoCount, aes(x= I_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_3"),
  ggplot(pseudoCount, aes(x= I_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_4"),
  ggplot(pseudoCount, aes(x= I_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_5"),
  ggplot(pseudoCount, aes(x= I_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_6"), 
  nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= I_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_1"),
  ggplot(pseudoCount, aes(x= I_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_2"),
  ggplot(pseudoCount, aes(x= I_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_3"),
  ggplot(pseudoCount, aes(x= I_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_4"),
  ggplot(pseudoCount, aes(x= I_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_5"),
  ggplot(pseudoCount, aes(x= I_V_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_6"),
  nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= NI_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_1"),
  ggplot(pseudoCount, aes(x= NI_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_2"),
  ggplot(pseudoCount, aes(x= NI_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_3"),
  ggplot(pseudoCount, aes(x= NI_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_4"),
  ggplot(pseudoCount, aes(x= NI_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_5"),
  ggplot(pseudoCount, aes(x= NI_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_6"),
  nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= NI_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_1"),
  ggplot(pseudoCount, aes(x= NI_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_2"),
  ggplot(pseudoCount, aes(x= NI_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_4"),
  ggplot(pseudoCount, aes(x= NI_V_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_6"),
  ggplot(pseudoCount, aes(x= NI_V_7)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_7"),
  nrow = 2)

Between-sample distribution

Check on the gene count distribution across all genes.

#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
df <- data.frame(df, Condition = substr(df$Samples,1,4))

ggplot(df, aes(x=Samples, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3", "#E69F00", "#FF0000")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#Histograms and density plots
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + ylim(c(0, 0.25)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab(expression(log[2](count+1)))

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:6] != 0) >=2 | sum(rawCountTable[i,7:12] != 0) >= 2 | sum(rawCountTable[i,13:18] != 0) >= 2 | sum(rawCountTable[i,19:23] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_TCT.csv")

pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering.png')

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) +
  xlim(-50, 50) + ylim(-30, 45)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_text).

I_V vs NI_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count sf files

# Generate DESeqData using the counting result generated using Salmon
countFiles_vcompare <- c(countFiles[19:25], countFiles[7:12])

countFiles_vcompare <- countFiles_vcompare[c(-3,-5)]

txi.salmon <- tximport(countFiles_vcompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 7:12), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5),]

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_V vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering_I_V vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering_I_V vs NI_V.png')

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)

pdf("PCA_TCT_IV-vs-NIV.pdf",
    height = 6,
    width = 8)
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)
dev.off()
## quartz_off_screen 
##                 2

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_V vs NI_V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_V vs NI_V.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_V samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected_iv_vs_niv <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected_iv_vs_niv, "VST_I_V vs NI_V.csv")

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_iv_vs_niv)))
}
sig_count <- rawCountTable_transform_detected_iv_vs_niv[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}

my_palette <- colorRampPalette(c("blue", "white", "red"))(128)
heatmap_matrix <- as.matrix(sig_dif[,7:17])

png('I_V vs NI_V RNASeq.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_V vs NI_V RNASeq\npadj < 0.05\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('I_V vs NI_V RNASeq.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "I_V vs NI_V RNASeq\npadj < 0.05\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('I_V vs NI_V RNASeq.png')

# output number of significant DE genes
dim(sig_dif)[1]
## [1] 309
write.csv(sig_dif, "Significant genes_I_V vs NI_V.csv")

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = I_V_mean)) +
  xlab("NI_V_Average(log2)") + ylab("I_V_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_V vs NI_V Scatter Plot"))

pdf('ScatterPlot_I_V vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen 
##                 2
# MA plot
(ma <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V MA Plot"))

pdf('MAPlot_I_V vs NI_V.pdf')
ma
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Volcano Plot"))
## Warning: Removed 26 rows containing missing values (geom_point).

pdf('VolcanoPlot_I_V vs NI_V.pdf')
volcano
## Warning: Removed 26 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

GO analysis for DE genes

Classic GO analysis is performed here for all DE genes detected in this dataset. The reference list is list of genes detected in RNASeq. Three categories of GO terms are tested here, including biological process, molecular function and cellular component.

target_gene <- as.character(rownames(sig_dif))
detected_gene <- as.character(rownames(detected_pseudocount))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO analysis/GO analysis_BP_I_V vs NI_V.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO analysis/GO analysis_MF_I_V vs NI_V.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO analysis/GO analysis_CC_I_V vs NI_V.csv")

Draw Dotplot representing the results

Biological process
png('GO analysis/GO dotplot_BP_I_V vs NI_V.png',
    width = 800,
    height = 400,
    res = 100,
    pointsize = 8)
dotplot(ego_BP, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO dotplot_BP_I_V vs NI_V.pdf',
    width = 10,
    height = 4)
dotplot(ego_BP, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO dotplot_BP_I_V vs NI_V.png')

Molecular Function
png('GO analysis/GO dotplot_MF_I_V vs NI_V.png',
    width = 800,
    height = 400,
    res = 100,
    pointsize = 8)
dotplot(ego_MF, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO dotplot_MF_I_V vs NI_V.pdf',
    width = 8,
    height = 4)
dotplot(ego_MF, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO dotplot_MF_I_V vs NI_V.png')

Cellular component
png('GO analysis/GO dotplot_CC_I_V vs NI_V.png',
    width = 800,
    height = 400,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO dotplot_CC_I_V vs NI_V.pdf',
    width = 8,
    height = 4)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO dotplot_CC_I_V vs NI_V.png')

Draw enrichment Go plot representing the results

Biological process
png('GO analysis/GO enrichment_BP_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO enrichment_BP_I_V vs NI_V.pdf',
    width = 16,
    height = 16)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO enrichment_BP_I_V vs NI_V.png')

Molecular function
png('GO analysis/GO enrichment_MF_I_V vs NI_V.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO enrichment_MF_I_V vs NI_V.pdf',
    width = 12,
    height = 12)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO enrichment_MF_I_V vs NI_V.png')

Cellular component
png('GO analysis/GO enrichment_CC_I_V vs NI_V.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO enrichment_CC_I_V vs NI_V.pdf',
    width = 10,
    height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO enrichment_CC_I_V vs NI_V.png')

Draw category netplot representing the results

The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color). ##### Biological process

OE_foldchanges <- sig_dif$log2FoldChange
names(OE_foldchanges) <- rownames(sig_dif)

png('GO analysis/GO cnetplot_BP_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO cnetplot_BP_I_V vs NI_V.pdf',
    width = 16,
    height = 16)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO cnetplot_BP_I_V vs NI_V.png')

Molecular function
png('GO analysis/GO cnetplot_MF_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO cnetplot_MF_I_V vs NI_V.pdf',
    width = 16,
    height = 16)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO cnetplot_MF_I_V vs NI_V.png')

Cellular component
png('GO analysis/GO cnetplot_CC_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
pdf('GO analysis/GO cnetplot_CC_I_V vs NI_V.pdf',
    width = 16,
    height = 16)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('GO analysis/GO cnetplot_CC_I_V vs NI_V.png')

I_V vs NI_V_filtered

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

I am filtering out I_V3.

# Generate DESeqData using the counting result generated using Salmon
countFiles_vcompare <- c(countFiles[19:25], countFiles[7:12])
countFiles_vcompare <- countFiles_vcompare[c(-3,-5,-10)]

txi.salmon <- tximport(countFiles_vcompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 7:12), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5,-10),]

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_V vs NI_V_filtered.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) +
  xlim(-50, 55) + ylim(-30, 50)

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:10] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_V vs NI_V_filtered.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_V vs NI_V_filtered.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_V samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_V vs NI_V_filtered.csv")

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:16] <- zscore(as.numeric(sig_dif[i,7:16]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:16])

png('I_V vs NI_V RNASeq_filtered.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_V vs NI_V RNASeq\npadj < 0.05\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 433
write.csv(sig_dif, "Significant genes_I_V vs NI_V_filtered.csv")

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,6:10])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
ggplot(dif_analysis, aes(x = NI_V_mean, y = I_V_mean)) +
  xlab("NI_V_Average(log2)") + ylab("I_V_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_V vs NI_V Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Volcano Plot")
## Warning: Removed 12 rows containing missing values (geom_point).

I_MKI vs I_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[7:12], countFiles[1:6])

txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(7:12, 1:6), ]
DESeqsampletable$condition <- factor(c(rep('control', 6), rep('experimental', 6)))

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_MKI vs I_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering_I_MKI vs I_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering_I_MKI vs I_V.png')

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))

pdf("PCA_TCT_IMKi-vs-IV.pdf",
    height = 6,
    width = 8)
PCA
dev.off()
## quartz_off_screen 
##                 2

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

I’m applying a different filtering criteria here. After discussing with Lucia, we decided to only look at genes that are expressed in 3 or more samples in each group.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:6] != 0) >=2 | sum(rawCountTable[i,7:12] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_MKI vs I_V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_MKI vs I_V.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_MKI samples compared to I_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

Here I relax the padj cutoff to 0.1

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_MKI vs I_V.csv")


dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:18] <- zscore(as.numeric(sig_dif[i,7:18]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:18])

png('I_MKI vs I_V RNASeq.png',
    width = 600,
    height = 800,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('I_MKI vs I_V RNASeq.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('I_MKI vs I_V RNASeq.png')

# output number of significant DE genes
dim(sig_dif)[1]
## [1] 17
write.csv(sig_dif, "Significant genes_I_MKI vs I_V.csv")

Draw heatmap of inflamed MK2i targets in NI_V vs I_V vs I_MK2i comparison of TCT and TNF

TCT

# rearrange the TCT master count matrix
rawCountTable_transform_detected_TCT <- read_csv("VST_TCT.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TCT$X1
rawCountTable_transform_detected_TCT$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TCT[,19:23], rawCountTable_transform_detected_TCT[,7:12], rawCountTable_transform_detected_TCT[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names

# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:23] <- zscore(as.numeric(sig_dif[i,7:23]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:23])

png('TCT MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png',
    width = 600,
    height = 700,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "TCT MK2i targets\nTCT I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('TCT MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.pdf',
    width = 9,
    height = 11)
heatmap.2(heatmap_matrix,
          main = "TCT MK2i targets\nTCT NI_V vs I_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('TCT MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png')

TNF

# rearrange the TCT master count matrix
rawCountTable_transform_detected_TNF <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF/VST_TNF.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TNF$X1
rawCountTable_transform_detected_TNF$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TNF[,17:21], rawCountTable_transform_detected_TNF[,7:11], rawCountTable_transform_detected_TNF[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names

# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:22] <- zscore(as.numeric(sig_dif[i,7:22]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:22])

png('TCT MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png',
    width = 600,
    height = 700,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "TCT MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('TCT MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.pdf',
    width = 9,
    height = 11)
heatmap.2(heatmap_matrix,
          main = "TCT MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('TCT MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png')

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_MKI_mean <- rowMeans(detected_pseudocount[,7:12])
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,1:6])
(scatter <- ggplot(dif_analysis, aes(x = I_V_mean, y = I_MKI_mean)) +
  xlab("I_V_Average(log2)") + ylab("I_MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_MKI vs I_V Scatter Plot"))

pdf('ScatterPlot_I_MKI vs I_V.pdf')
scatter
dev.off()
## quartz_off_screen 
##                 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V MA Plot"))

pdf('MAPlot_I_MKI vs I_V.pdf')
MA
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Volcano Plot"))
## Warning: Removed 44 rows containing missing values (geom_point).

pdf('VolcanoPlot_I_MKI vs I_V.pdf')
volcano
## Warning: Removed 44 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

I_MKI vs I_V_filtered

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

I am filtering out I_V3. Which is the weird one.

# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[7:12], countFiles[1:6])
countFiles_icompare <- countFiles_icompare[-3]

txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(7:12, 1:6), ]
DESeqsampletable <- DESeqsampletable[-3,]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 6)))

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_MKI vs I_V_filtered.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) +
  xlim(-50, 40) + ylim(-25, 30)

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

I’m applying a different filtering criteria here. After discussing with Lucia, we decided to only look at genes that are expressed in 3 or more samples in each group.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_MKI vs I_V_filtered.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_MKI vs I_V_filtered.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_MKI samples compared to I_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

Here I relax the padj cutoff to 0.1

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_MKI vs I_V_filtered.csv")

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:17])

png('I_MKI vs I_V RNASeq_filtered.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "both",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 23
write.csv(sig_dif, "Significant genes_I_MKI vs I_V_filtered.csv")

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_MKI_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,1:5])
ggplot(dif_analysis, aes(x = I_V_mean, y = I_MKI_mean)) +
  xlab("I_V_Average(log2)") + ylab("I_MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_MKI vs I_V Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Volcano Plot")
## Warning: Removed 38 rows containing missing values (geom_point).

NI_MKI vs NI_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_nicompare <- c(countFiles[19:25], countFiles[13:18])
countFiles_nicompare <- countFiles_nicompare[c(-3,-5)]

txi.salmon <- tximport(countFiles_nicompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 13:18), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5),]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 6)))

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ batch + condition + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_NI_MKI vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering_NI_MKI vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering_NI_MKI vs NI_V.png')

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))

pdf('PCA_TCT_NIMKi-vs-NIV.pdf',
    height = 6,
    width = 8)
PCA
dev.off()
## quartz_off_screen 
##                 2

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_NI_MKI vs NI_V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_NI_MKI vs NI_V.csv")

Draw heatmap for transcripts that are significantly dysregulated in NI_MKI samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

Here I relaxed the padj cutoff to 0.1 (actually this didn’t change the number of DE genes, it’s the same as padj as 0.05)

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_NI_MKI vs NI_V.csv")

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:17])

png('NI_MKI vs NI_V RNASeq.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.1",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('NI_MKI vs NI_V RNASeq.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.1",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('NI_MKI vs NI_V RNASeq.png')

# output number of significant DE genes
dim(sig_dif)[1]
## [1] 9
write.csv(sig_dif, "Significant genes_NI_MKI vs NI_V.csv")

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$NI_MKI_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = NI_MKI_mean)) +
  xlab("NI_V_Average(log2)") + ylab("NI_MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "NI_MKI vs NI_V Scatter Plot"))

pdf('ScatterPlot_NI_MKI vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen 
##                 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V MA Plot"))

pdf('MAPlot_NI_MKI vs NI_V.pdf')
MA
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V Volcano Plot"))
## Warning: Removed 54 rows containing missing values (geom_point).

pdf('VolcanoPlot_NI_MKI vs NI_V.pdf')
volcano
## Warning: Removed 54 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

all_MKI vs all_V

# Generate DESeqData using the counting result generated using Salmon

countFiles_all <- c(countFiles[19:25], countFiles[7:12], countFiles[13:18], countFiles[1:6])
countFiles_all  <- countFiles_all[-c(3,5)]

txi.salmon <- tximport(countFiles_all, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable_all$condition <- c(rep("experimental", 6), rep("control", 6), rep("experimental",6), rep("control", 7))
DESeqsampletable <- rbind(DESeqsampletable_all[19:25,], DESeqsampletable_all[7:12,], DESeqsampletable_all[13:18,], DESeqsampletable_all[1:6,])
DESeqsampletable <- DESeqsampletable[-c(3,5),]

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_MKI vs V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)

  #geom_text(aes(label=name), vjust = 2) 
  #xlim(-45, 60) + ylim(-30, 80)

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2 | sum(rawCountTable[i,12:17] != 0) >= 2 | sum(rawCountTable[i,18:23] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_MKI vs V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_MKI vs V.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_V samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_MKI vs V.csv")

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:29] <- zscore(as.numeric(sig_dif[i,7:29]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:29])

png('MKI vs V RNASeq.png',
    width = 800,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "MKI vs V RNASeq\npadj < 0.1\nLFC > 0.1",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('MKI vs V RNASeq.png')

# output number of significant DE genes
dim(sig_dif)[1]
## [1] 9
write.csv(sig_dif, "Significant genes_MKI vs V.csv")

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$MKI_mean <- rowMeans(detected_pseudocount[,12:23])
dif_analysis$V_mean <- rowMeans(detected_pseudocount[,1:11])
ggplot(dif_analysis, aes(x = V_mean, y = MKI_mean)) +
  xlab("V_Average(log2)") + ylab("MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0.1), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "MKI vs V Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V Volcano Plot")
## Warning: Removed 71 rows containing missing values (geom_point).

sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] mosaic_1.6.0                Matrix_1.2-18              
##  [3] mosaicData_0.17.0           ggformula_0.9.4            
##  [5] ggstance_0.3.4              knitr_1.28                 
##  [7] pathview_1.28.0             org.Hs.eg.db_3.11.1        
##  [9] org.Mm.eg.db_3.11.1         DOSE_3.14.0                
## [11] clusterProfiler_3.16.0      VennDiagram_1.6.20         
## [13] futile.logger_1.4.3         dplyr_0.8.5                
## [15] readr_1.3.1                 RColorBrewer_1.1-2         
## [17] gplots_3.0.3                mixOmics_6.12.0            
## [19] MASS_7.3-51.6               reshape_0.8.8              
## [21] lattice_0.20-41             ggplot2_3.3.0              
## [23] EnsDb.Mmusculus.v79_2.99.0  ensembldb_2.12.1           
## [25] AnnotationFilter_1.12.0     GenomicFeatures_1.40.0     
## [27] AnnotationDbi_1.50.0        gridExtra_2.3              
## [29] tximport_1.16.0             limma_3.44.1               
## [31] DESeq2_1.28.0               SummarizedExperiment_1.18.1
## [33] DelayedArray_0.14.0         matrixStats_0.56.0         
## [35] Biobase_2.48.0              GenomicRanges_1.40.0       
## [37] GenomeInfoDb_1.24.0         IRanges_2.22.1             
## [39] S4Vectors_0.26.0            BiocGenerics_0.34.0        
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.0.0         htmlwidgets_1.5.1        RSQLite_2.2.0           
##   [4] BiocParallel_1.22.0      scatterpie_0.1.4         munsell_0.5.0           
##   [7] withr_2.2.0              colorspace_1.4-1         GOSemSim_2.14.0         
##  [10] labeling_0.3             KEGGgraph_1.48.0         urltools_1.7.3          
##  [13] GenomeInfoDbData_1.2.3   polyclip_1.10-0          bit64_0.9-7             
##  [16] farver_2.0.3             downloader_0.4           vctrs_0.2.4             
##  [19] generics_0.0.2           lambda.r_1.2.4           xfun_0.13               
##  [22] BiocFileCache_1.12.0     R6_2.4.1                 graphlayouts_0.7.0      
##  [25] locfit_1.5-9.4           bitops_1.0-6             fgsea_1.14.0            
##  [28] gridGraphics_0.5-0       assertthat_0.2.1         scales_1.1.0            
##  [31] ggraph_2.0.2             enrichplot_1.8.1         gtable_0.3.0            
##  [34] tidygraph_1.1.2          rlang_0.4.6              genefilter_1.70.0       
##  [37] splines_4.0.0            rtracklayer_1.48.0       lazyeval_0.2.2          
##  [40] broom_0.5.6              europepmc_0.3            mosaicCore_0.6.0        
##  [43] BiocManager_1.30.10      yaml_2.2.1               reshape2_1.4.4          
##  [46] crosstalk_1.1.0.1        backports_1.1.6          qvalue_2.20.0           
##  [49] tools_4.0.0              ggplotify_0.0.5          ellipsis_0.3.0          
##  [52] ggdendro_0.1-20          ggridges_0.5.2           Rcpp_1.0.4.6            
##  [55] plyr_1.8.6               progress_1.2.2           zlibbioc_1.34.0         
##  [58] purrr_0.3.4              RCurl_1.98-1.2           prettyunits_1.1.1       
##  [61] openssl_1.4.1            viridis_0.5.1            cowplot_1.0.0           
##  [64] ggrepel_0.8.2            magrittr_1.5             data.table_1.12.8       
##  [67] RSpectra_0.16-0          futile.options_1.0.1     DO.db_2.9               
##  [70] triebeard_0.3.0          ProtGenerics_1.20.0      hms_0.5.3               
##  [73] evaluate_0.14            xtable_1.8-4             XML_3.99-0.3            
##  [76] leaflet_2.0.3            compiler_4.0.0           biomaRt_2.44.0          
##  [79] ellipse_0.4.1            tibble_3.0.1             KernSmooth_2.23-17      
##  [82] crayon_1.3.4             htmltools_0.4.0          corpcor_1.6.9           
##  [85] tidyr_1.0.3              geneplotter_1.66.0       DBI_1.1.0               
##  [88] tweenr_1.0.1             formatR_1.7              dbplyr_1.4.3            
##  [91] rappdirs_0.3.1           gdata_2.18.0             igraph_1.2.5            
##  [94] pkgconfig_2.0.3          rvcheck_0.1.8            GenomicAlignments_1.24.0
##  [97] xml2_1.3.2               rARPACK_0.11-0           annotate_1.66.0         
## [100] XVector_0.28.0           stringr_1.4.0            digest_0.6.25           
## [103] graph_1.66.0             Biostrings_2.56.0        rmarkdown_2.1           
## [106] fastmatch_1.1-0          curl_4.3                 Rsamtools_2.4.0         
## [109] gtools_3.8.2             lifecycle_0.2.0          nlme_3.1-147            
## [112] jsonlite_1.6.1           viridisLite_0.3.0        askpass_1.1             
## [115] pillar_1.4.4             KEGGREST_1.28.0          httr_1.4.1              
## [118] survival_3.1-12          GO.db_3.11.1             glue_1.4.0              
## [121] png_0.1-7                bit_1.1-15.2             Rgraphviz_2.32.0        
## [124] ggforce_0.3.1            stringi_1.4.6            blob_1.2.1              
## [127] caTools_1.18.0           memoise_1.1.0